# ==============================================================================
# name: RQ1-professionalism-prevalence.R
# date:	Jan 25, 2022
# author: Bernhard Clemm / Tiago Ventura
# purpose: summarize and plot prevalence of survey professionalism
# ==============================================================================

rm(list = ls())

source("code/utils/styles.R")

# READ IN, FILTER AND BIND DATA ================================================

# Seven-day filter:
## "Across samples, some donating subjects participated in the surveys but provided little
## browsing data. As such subjects would distort some of the proportional metrics we calculate—
## for example, someone who submitted five visits in total, all of which are to a survey site, would
## be treated as doing surveys 100 percent of the time—we exclude subjects who submitted data
## from less than seven days."

# Wave filter:
## To avoid bias of aggregate estimates towards those returning to later waves,
## we only use wave 1 data in LU and FB

fb <- read.csv("data/analysis_FB.csv")

lu <- read.csv("data/analysis_LU.csv")

yg <- read.csv("data/analysis_YG.csv")

## For continuous outcomes ####

visits_fb <- fb %>%
  filter(wave == 1) %>%
  filter(n_days_active >= 7) %>%
  mutate(person_id = as.character(person_id)) %>%
  select(
    dataset, person_id, weight,
    starts_with("n_"), # visit count variables
    starts_with("s_")
  ) # visit duration variables

visits_lu <- lu %>%
  filter(wave == 1) %>%
  filter(n_days_active >= 7) %>%
  mutate(person_id = as.character(person_id)) %>%
  select(
    dataset, person_id, weight,
    starts_with("n_"), # visit count variables
    starts_with("s_")
  ) # visit duration variables

visits_yg <- yg %>%
  filter(n_days_active >= 7) %>%
  mutate(person_id = as.character(person_id)) %>%
  select(
    dataset, person_id, weight, device_main,
    starts_with("n_"), # visit count variables
    starts_with("s_")
  ) # visit duration variables

visits <- bind_rows(visits_fb, visits_lu, visits_yg)

### Add proportional variables ####

visits <- visits %>%
  mutate(
    n_classified = n_survey + n_google + n_youtube + n_facebook + n_amazon,
    n_other = n_total - n_classified,
    s_classified_5_na = s_survey_5_na + s_google_5_na + s_youtube_5_na + s_facebook_5_na + s_amazon_5_na,
    s_other_5_na = s_total_5_na - s_classified_5_na
  ) %>%
  mutate(
    prop_survey = n_survey / n_total,
    prop_google = n_google / n_total,
    prop_youtube = n_youtube / n_total,
    prop_facebook = n_facebook / n_total,
    prop_amazon = n_amazon / n_total,
    prop_other = n_other / n_total,
    prop_survey_time = s_survey_5_na / s_total_5_na,
    prop_google_time = s_google_5_na / s_total_5_na,
    prop_youtube_time = s_youtube_5_na / s_total_5_na,
    prop_facebook_time = s_facebook_5_na / s_total_5_na,
    prop_amazon_time = s_amazon_5_na / s_total_5_na,
    prop_other_time = s_other_5_na / s_total_5_na
  )

### Incorporate weights ####

visits <- visits %>%
  ## some cases do not have a weight because no survey data; assign weight of 1
  mutate(weight = ifelse(is.na(weight), 1, weight)) %>%
  as_survey_design(weights = weight)

## For categorical outcomes ####
## We do not apply this wave filter for the analysis of categorical professionalism
## (There were a few subjects who particpated in W2 only due to technical error)

profs_fb <- fb %>%
  filter(n_days_active >= 7) %>%
  mutate(person_id = as.character(person_id)) %>%
  select(
    dataset, person_id, weight,
    starts_with("professional_")
  ) %>% # professionalism indicator
  # keep only one row for subjects with more than one wave
  distinct(person_id, .keep_all = T)

profs_lu <- lu %>%
  filter(n_days_active >= 7) %>%
  mutate(person_id = as.character(person_id)) %>%
  select(
    dataset, person_id, weight,
    starts_with("professional_")
  ) %>% # professionalism indicator
  # keep only one row for subjects with more than one wave
  distinct(person_id, .keep_all = T)

profs_yg <- yg %>%
  filter(n_days_active >= 7) %>%
  mutate(person_id = as.character(person_id)) %>%
  select(
    dataset, person_id, weight, device_main,
    starts_with("professional_")
  ) # professionalism indicator

profs <- bind_rows(profs_fb, profs_lu, profs_yg)

### Incorporate weights ####

profs <- profs %>%
  ## some cases do not have a weight because no survey data; assign weight of 1
  mutate(weight = ifelse(is.na(weight), 1, weight)) %>%
  as_survey_design(weights = weight)

# MAIN PAPER ===================================================================

## Fig 1: Percent of survey visits (out of all visits) ####
# What is the percentage of survey taking in the aggregate, across all subjects?

### Summarize by data set / visit type
agg_perc_visits <- visits %>%
  group_by(dataset) %>%
  dplyr::summarize(
    total = survey_total(n_total),
    survey = survey_total(n_survey),
    google = survey_total(n_google),
    youtube = survey_total(n_youtube),
    facebook = survey_total(n_facebook),
    amazon = survey_total(n_amazon),
    other = survey_total(n_other)
  ) %>%
  pivot_longer(total:other_se, names_to = "visit_type", values_to = "count") %>%
  group_by(dataset) %>%
  mutate(total = count[visit_type == "total"]) %>%
  filter(
    visit_type != "total",
    !str_detect(visit_type, "se")
  ) %>%
  mutate(perc = count * 100 / total)

### recode labels for plot
recode_labels_agg <- function(dt) {
  dt <- dt %>%
    mutate(
      visit_type = str_to_lower(visit_type),
      visit_type = case_when(
        !str_detect(visit_type, "survey|other") ~ paste0(visit_type, ".com"),
        visit_type == "survey" ~ "Survey sites",
        visit_type == "other" ~ "Other sites",
        .default = visit_type
      ),
      visit_type = factor(
        visit_type,
        ordered = TRUE,
        levels = c(
          "Survey sites", "amazon.com", "facebook.com",
          "google.com", "youtube.com", "Other sites"
        )
      )
    ) %>%
    mutate(alpha = ifelse(visit_type == "Survey sites", TRUE, FALSE))
  return(dt)
}

agg_perc_visits <- recode_labels_agg(agg_perc_visits)

### Plot
agg_perc_visits %>%
  ggplot(aes(
    fill = dataset, color = dataset, y = perc, x = factor(visit_type),
    alpha = alpha
  )) +
  geom_bar(position = "dodge", stat = "identity", linewidth = .5) +
  facet_wrap(~dataset) +
  scale_y_continuous(
    name = "Percent of all visits", limits = c(0, 60),
    labels = function(x) paste0(x, "%")
  ) +
  scale_fill_manual(values = c(facebook, lucid, yougov)) +
  scale_color_manual(values = c(facebook, lucid, yougov)) +
  scale_alpha_manual(values = c(.1, 1)) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 14),
    axis.title.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.x = element_blank(),
    legend.position = "bottom"
  ) +
  guides(fill = "none", color = "none", alpha = "none")

ggsave("output/fig1_rq1_agg_percent_survey_visits.pdf",
  width = 12, height = 8, units = "in",
  pointsize = 12, bg = "white"
)

## Fig 2: Individual level distribution of survey visits per respondent ####

visits %>%
  ggplot(aes(
    x = n_survey, y = dataset, fill = dataset, color = dataset,
    group = dataset, weight = weight
  )) +
  stat_density_ridges(
    quantile_lines = TRUE, quantiles = 2, alpha = .4,
    scale = 1, rel_min_height = 0.01
  ) +
  scale_fill_manual(values = c(facebook, lucid, yougov)) +
  scale_color_manual(values = c(facebook, lucid, yougov)) +
  guides(fill = "none", color = "none") +
  ylab("") +
  xlab("Total number of survey visits per respondent") +
  xlim(0, 5000)

ggsave("output/fig2_rq1_ind_percent_survey_visits_dist.pdf",
  width = 12, height = 8, units = "in",
  pointsize = 12, bg = "white"
)

## Fig 3: Percent of survey professionals for different definitions ####

### Summarize by data set / professional measure
perc_profs <- profs %>%
  group_by(dataset) %>%
  summarise(across(starts_with("professional_"), survey_mean, na.rm = TRUE)) %>%
  select(!ends_with("se")) %>%
  pivot_longer(starts_with("professional"), names_to = "method", values_to = "prevalence")

### recode labels for plot
recode_labels_profs <- function(dt) {
  dt <- dt %>%
    mutate(
      method = case_when(
        method == "professional_1" ~ "> 100 survey visits per day",
        method == "professional_2" ~ "> 50% of browsing visits",
        method == "professional_3" ~ "> 50% of browsing time",
        method == "professional_all" ~ "Any of the Categories"
      ),
      method = fct_relevel(method, c(
        "> 100 survey visits per day",
        "> 50% of browsing visits",
        "> 50% of browsing time",
        "Any of the Categories"
      ))
    )
}

perc_profs <- recode_labels_profs(perc_profs)

perc_profs %>%
  ggplot(aes(
    fill = method, y = prevalence * 100, x = method,
    label = paste0(round(prevalence * 100, 1), " %")
  )) +
  geom_bar(
    alpha = .8,
    position = position_dodge(width = 1),
    stat = "identity", width = .9, color = "black"
  ) +
  geom_text(
    position = position_dodge(width = 0.9),
    vjust = -1,
    size = 5,
    family = my_font, color = "black"
  ) +
  facet_wrap(~dataset) +
  scale_y_continuous(
    name = "Percent of Professionals Survey Takers", limits = c(0, 100),
    labels = function(x) paste0(x, "%")
  ) +
  scale_fill_manual(
    values = RColorBrewer::brewer.pal(4, "Blues"),
    name = "Measure:"
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.x = element_blank(),
    legend.position = "bottom"
  ) +
  guides(fill = guide_legend(nrow = 2))

ggsave("output/fig3_rq1_percent_survey_profs.pdf",
  width = 12, height = 8, units = "in",
  pointsize = 12, bg = "white"
)

# SM C.1: UNWEIGHTED RESULTS =======================================================

## Fig C.2: Percent of survey visits (out of all visits), unweighted ####

agg_perc_visits_unweighted <- visits %>%
  group_by(dataset) %>%
  dplyr::summarize(
    total = sum(n_total),
    survey = sum(n_survey),
    google = sum(n_google),
    youtube = sum(n_youtube),
    facebook = sum(n_facebook),
    amazon = sum(n_amazon),
    other = sum(n_other)
  ) %>%
  pivot_longer(total:other, names_to = "visit_type", values_to = "count") %>%
  group_by(dataset) %>%
  mutate(total = count[visit_type == "total"]) %>%
  filter(
    visit_type != "total",
    !str_detect(visit_type, "se")
  ) %>%
  mutate(perc = count * 100 / total)

agg_perc_visits_unweighted <- recode_labels_agg(agg_perc_visits_unweighted)

agg_perc_visits_unweighted %>%
  ggplot(aes(
    fill = dataset, color = dataset, y = perc, x = factor(visit_type),
    alpha = alpha
  )) +
  geom_bar(position = "dodge", stat = "identity", size = .5) +
  facet_wrap(~dataset) +
  scale_y_continuous(
    name = "Percent of all visits", limits = c(0, 60),
    labels = function(x) paste0(x, "%")
  ) +
  scale_fill_manual(values = c(facebook, lucid, yougov)) +
  scale_color_manual(values = c(facebook, lucid, yougov)) +
  scale_alpha_manual(values = c(.1, 1)) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 14),
    axis.title.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.x = element_blank(),
    legend.position = "bottom"
  ) +
  guides(
    fill = "none",
    color = "none", alpha = "none"
  )

ggsave("output/figC2_rq1_agg_percent_survey_visits_unweighted.pdf",
  width = 12, height = 8, units = "in",
  pointsize = 12, bg = "white"
)

## Fig C.3: Individual level distribution of survey visits per respondent, unweighted ####

visits %>%
  ggplot(aes(
    x = n_survey, y = dataset, fill = dataset, color = dataset,
    group = dataset
  )) +
  stat_density_ridges(
    quantile_lines = TRUE, quantiles = 2, alpha = .4,
    scale = 1, rel_min_height = 0.01
  ) +
  scale_fill_manual(values = c(facebook, lucid, yougov)) +
  scale_color_manual(values = c(facebook, lucid, yougov)) +
  guides(fill = "none", color = "none") +
  ylab("") +
  xlab("Total number of survey visits per respondent") +
  xlim(0, 5000)

ggsave("output/figC3_rq1_ind_percent_survey_visits_dist_unweighted.pdf",
  width = 12, height = 8, units = "in",
  pointsize = 12, bg = "white"
)

## Fig C.4: Percent of survey professionals for different definitions, unweighted ####

perc_profs_unweighted <- profs %>%
  as.data.frame() %>%
  group_by(dataset) %>%
  summarise(across(starts_with("professional_"), mean, na.rm = TRUE)) %>%
  select(!ends_with("se")) %>%
  pivot_longer(starts_with("professional"), names_to = "method", values_to = "prevalence")

perc_profs_unweighted <- recode_labels_profs(perc_profs_unweighted)

perc_profs_unweighted %>%
  ggplot(aes(
    fill = method, y = prevalence * 100, x = method,
    label = paste0(round(prevalence * 100, 2), " %")
  )) +
  geom_bar(
    alpha = .8,
    position = position_dodge(width = 1),
    stat = "identity", width = .9, color = "black"
  ) +
  geom_text(
    position = position_dodge(width = 0.9),
    vjust = -1,
    size = 5,
    family = my_font, color = "black"
  ) +
  facet_wrap(~dataset) +
  scale_y_continuous(
    name = "Percent of Professionals Survey Takers", limits = c(0, 100),
    labels = function(x) paste0(x, "%")
  ) +
  scale_fill_manual(
    values = RColorBrewer::brewer.pal(4, "Blues"),
    name = "Method:"
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.x = element_blank(),
    legend.position = "bottom"
  ) +
  guides(fill = guide_legend(nrow = 2))

ggsave("output/figC4_rq1_percent_survey_profs_unweighted.pdf",
       width = 12, height = 8, units = "in",
       pointsize = 12, bg = "white"
)

# SM C.2: DISAGGREGATION BY DESKTOP/MOBILE USERS ===============================

## Fig C.5: Percent of survey visits (out of all visits), desktop vs. mobile YouGov ####

perc_visits_agg_device <- visits %>%
  filter(dataset == "YouGov") %>%
  drop_na(device_main) %>%
  group_by(device_main) %>%
  dplyr::summarize(
    total = survey_total(n_total),
    survey = survey_total(n_survey),
    google = survey_total(n_google),
    youtube = survey_total(n_youtube),
    facebook = survey_total(n_facebook),
    amazon = survey_total(n_amazon),
    other = survey_total(n_other)
  ) %>%
  pivot_longer(total:other_se, names_to = "visit_type", values_to = "count") %>%
  group_by(device_main) %>%
  mutate(total = count[visit_type == "total"]) %>%
  filter(
    visit_type != "total",
    !str_detect(visit_type, "se")
  ) %>%
  mutate(perc = count * 100 / total)

perc_visits_agg_device <- recode_labels_agg(perc_visits_agg_device)

perc_visits_agg_device %>%
  filter(device_main != "Desktop/Mobile") %>%
  ggplot(aes(
    fill = device_main, color = device_main, y = perc, x = factor(visit_type),
    alpha = alpha
  )) +
  geom_bar(position = "dodge", stat = "identity", size = .5) +
  facet_wrap(~device_main) +
  scale_y_continuous(
    name = "Percent of all visits", limits = c(0, 60),
    labels = function(x) paste0(x, "%")
  ) +
  scale_fill_manual(values = c("grey20", "grey20")) +
  scale_color_manual(values = c("grey20", "grey20")) +
  scale_alpha_manual(values = c(.1, 1)) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 14),
    panel.grid.minor.x = element_blank(),
    axis.title.x = element_blank(),
    panel.grid.major.x = element_blank(),
    legend.position = "bottom"
  ) +
  guides(fill = "none", color = "none", alpha = "none")

ggsave("output/figC5_rq1_percent_survey_profs_device.pdf",
  width = 12, height = 8, units = "in",
  pointsize = 12, bg = "white"
)

## Fig C.6: Individual level distribution of survey visits per respondent, desktop vs. mobile YouGov ####

visits %>%
  filter(device_main != "Desktop/Mobile") %>%
  ggplot(aes(x = n_survey, y = device_main, fill = device_main, color = device_main)) +
  stat_density_ridges(
    quantile_lines = TRUE,
    quantiles = 2,
    alpha = .4,
    jittered_points = TRUE,
    point_size = 1, point_alpha = .1,
    position = position_raincloud(adjust_vlines = TRUE),
    scale = .7,
    rel_min_height = 0.01
  ) +
  scale_fill_manual(
    values = c("grey20", "grey20"),
    name = "",
    guide = guide_legend(reverse = TRUE, title.position = "top")
  ) +
  scale_color_manual(
    values = c("grey20", "grey20"),
    name = "",
    guide = guide_legend(reverse = TRUE, title.position = "top")
  ) +
  guides(fill = "none", color = "none") +
  labs() +
  ylab("") +
  xlab("Total number of survey visits per respondent") +
  xlim(0, 5000)

ggsave("output/figC6_rq1_ind_percent_survey_visits_dist_device.pdf",
  width = 12, height = 8, units = "in",
  pointsize = 12, bg = "white"
)


## Fig C.7: Percent of survey professionals for different definitions, desktop vs. mobile YouGov ####

perc_profs_device <- profs %>%
  group_by(device_main) %>%
  summarise(across(starts_with("professional_"), survey_mean, na.rm = TRUE)) %>%
  select(!ends_with("se")) %>%
  pivot_longer(starts_with("professional"), names_to = "method", values_to = "prevalence")

perc_profs_device <- recode_labels_profs(perc_profs_device)

perc_profs_device %>%
  filter(device_main != "Desktop/Mobile") %>%
  ggplot(aes(
    fill = method, y = prevalence * 100, x = method,
    label = paste0(round(prevalence * 100, 2), " %")
  )) +
  geom_bar(
    alpha = .8,
    position = position_dodge(width = 1),
    stat = "identity", width = .9, color = "black"
  ) +
  geom_text(
    position = position_dodge(width = 0.9),
    vjust = -1,
    size = 5,
    family = my_font, color = "black"
  ) +
  facet_wrap(~device_main) +
  scale_y_continuous(
    name = "Percent of Professionals Survey Takers", limits = c(0, 100),
    labels = function(x) paste0(x, "%")
  ) +
  scale_fill_manual(
    values = RColorBrewer::brewer.pal(4, "Blues"),
    name = "Method:"
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.x = element_blank(),
    legend.position = "bottom"
  ) +
  guides(fill = guide_legend(nrow = 2))

ggsave("output/figC7_rq1_percent_survey_profs_device.pdf",
  width = 12, height = 8, units = "in",
  pointsize = 12, bg = "white"
)

# SM C.3: DISAGGREGATION BY IDENTIFICATION APPROACHES ==========================

## Fig C.8: Percent of survey visits (out of all visits), by method ####

perc_visits_agg_approaches <- visits %>%
  mutate(across(
    c(
      n_survey_url, n_bevec,
      n_manual_labelling1, n_manual_labelling2
    ),
    ~ ifelse(is.na(.), 0, .)
  )) %>%
  group_by(dataset) %>%
  dplyr::summarize(
    total = survey_total(n_total),
    survey_url = survey_total(n_survey_url),
    bevec = survey_total(n_bevec),
    manual_labelling1 = survey_total(n_manual_labelling1),
    manual_labelling2 = survey_total(n_manual_labelling2),
    google = survey_total(n_google),
    youtube = survey_total(n_youtube),
    facebook = survey_total(n_facebook),
    amazon = survey_total(n_amazon),
    other = survey_total(n_other)
  ) %>%
  pivot_longer(total:other_se, names_to = "visit_type", values_to = "count") %>%
  group_by(dataset) %>%
  mutate(total = count[visit_type == "total"]) %>%
  filter(
    visit_type != "total",
    !str_detect(visit_type, "se")
  ) %>%
  mutate(perc = count * 100 / total)

perc_visits_agg_approaches <- perc_visits_agg_approaches %>%
  mutate(
    visit_type = case_when(
      str_detect(visit_type, "amazon|google|facebook|youtube") ~ paste0(visit_type, ".com"),
      visit_type == "other" ~ "Other sites",
      visit_type == "bevec" ~ "Bevec & Vehovar sites",
      visit_type == "survey_url" ~ "URL host contains 'survey'",
      visit_type == "manual_labelling1" ~ "Manual Labelling (survey-only site)",
      visit_type == "manual_labelling2" ~ "Manual Labelling (get-paid-to site)"
    ),
    visit_type = fct_relevel(visit_type, c(
      "Manual Labelling (survey-only site)",
      "Manual Labelling (get-paid-to site)",
      "URL host contains 'survey'",
      "Bevec & Vehovar sites",
      "amazon.com",
      "facebook.com",
      "google.com",
      "youtube.com",
      "Other sites"
    ))
  )

perc_visits_agg_approaches %>%
  ggplot(aes(fill = visit_type, y = perc, x = factor(visit_type))) +
  geom_bar(position = "dodge", stat = "identity", color = "black", size = .1) +
  facet_wrap(~dataset) +
  scale_y_continuous(
    name = "Percent of All Visits", limits = c(0, 60),
    labels = function(x) paste0(x, "%")
  ) +
  scale_fill_manual(
    values = c(
      "#000000", "#252525", "#525252", "#737373", "#969696", "#BDBDBD", "#D9D9D9", "#F0F0F0", "#FFFFFF"
    ),
    na.translate = F
  ) +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    legend.title = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.x = element_blank(),
    legend.position = "bottom"
  ) +
  guides(fill = guide_legend(nrow = 3))

ggsave("output/figC8_rq1_agg_percent_survey_visits_approaches.pdf",
  width = 12, height = 8, units = "in",
  pointsize = 12, bg = "white"
)

## Fig C.9: Ten most frequented survey sites ####

visits_by_hosts_fb <- read.csv("data/browsing_hosts/hosts_popular_FB.csv") %>%
  mutate(dataset = "Facebook")
visits_by_hosts_lu <- read.csv("data/browsing_hosts/hosts_popular_LU.csv") %>%
  mutate(dataset = "Lucid")
visits_by_hosts_yg <- read.csv("data/browsing_hosts/hosts_popular_YG.csv") %>%
  mutate(dataset = "YouGov")

visits_by_hosts_all <- bind_rows(
  visits_by_hosts_fb, visits_by_hosts_lu, visits_by_hosts_yg
)

total_by_host <- visits_by_hosts_all %>%
  group_by(dataset) %>%
  mutate(total = sum(ct), share_hosts = ct / total * 100)

top_hosts <- total_by_host %>%
  group_by(dataset) %>%
  slice_max(order_by = share_hosts, n = 11)

reorder_within <- function(x, by, within, fun = median, sep = "___") {
  new_x <- paste(x, within, sep = sep)
  stats::reorder(new_x, by, FUN = fun)
}
scale_y_reordered <- function(...) {
  ggplot2::scale_y_discrete(labels = function(x) sub("___.*$", "", x), ...)
}

# 4. Define your brand-colors
my_cols <- c(
  "Facebook" = facebook,
  "Lucid"    = lucid,
  "YouGov"   = yougov
)

# 5. Single ggplot with facet_wrap
top_hosts %>%
  ggplot(aes(
    x = share_hosts,
    y = reorder_within(url_host, share_hosts, dataset),
    fill = dataset
  )) +
  geom_col(color = "black") +
  geom_text(aes(label = url_host),
    hjust = -0.1, size = 4
  ) +
  facet_wrap(~dataset, scales = "free_y", ncol = 3) +
  scale_y_reordered() +
  scale_x_continuous(
    "Percentage of survey visits by host",
    labels = ~ paste0(.x, "%"),
    limits = c(0, 30)
  ) +
  scale_fill_manual(values = my_cols, guide = "none") +
  theme(
    strip.text         = element_text(face = "bold"),
    axis.title.y       = element_blank(),
    axis.text.y        = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank()
  )

ggsave("output/figC9_rq1_survey_visits_top_hosts.pdf",
  width = 12, height = 8, units = "in",
  pointsize = 12, bg = "white"
)

# SM C.4: AVERAGE INDIVIDUAL-LEVEL PERCENTAGE ==================================

## Fig C.10: Average individual-level percent of visits to survey sites out of all visits. ####

avg_perc <- visits %>%
  group_by(dataset) %>%
  dplyr::summarize(
    survey = survey_mean(prop_survey) * 100,
    google = survey_mean(prop_google) * 100,
    youtube = survey_mean(prop_youtube) * 100,
    facebook = survey_mean(prop_facebook) * 100,
    amazon = survey_mean(prop_amazon) * 100,
    other = survey_mean(prop_other) * 100
  ) %>%
  pivot_longer(survey:other, names_to = "visit_type", values_to = "perc") %>%
  filter(!str_detect(visit_type, "_se"))

avg_perc <- recode_labels_agg(avg_perc)

avg_perc %>%
  ggplot(aes(
    fill = dataset, color = dataset, y = perc, x = factor(visit_type),
    alpha = alpha
  )) +
  geom_bar(position = "dodge", stat = "identity", size = .5) +
  facet_wrap(~dataset) +
  scale_y_continuous(
    name = "Average individual-level percent of visits", limits = c(0, 65),
    labels = function(x) paste0(x, "%")
  ) +
  scale_fill_manual(values = c(facebook, lucid, yougov)) +
  scale_color_manual(values = c(facebook, lucid, yougov)) +
  scale_alpha_manual(values = c(.1, 1)) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 14),
    axis.text.y = element_text(size = 16),
    axis.title.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.x = element_blank(),
    legend.position = "bottom"
  ) +
  guides(
    fill = FALSE,
    color = FALSE, alpha = FALSE
  )

ggsave("output/figC10_rq1_ind_percent_survey_visits.pdf",
  width = 12, height = 8, units = "in",
  pointsize = 12, bg = "white"
)

# SM C.5: DURATION =================================================================

## Fig C.11: Percent of visit time to survey sites (out of all browsing time). ####

agg_perc_time <- visits %>%
  group_by(dataset) %>%
  dplyr::summarize(
    total = survey_total(s_total_5_na, na.rm = T),
    survey = survey_total(s_survey_5_na, na.rm = T),
    google = survey_total(s_google_5_na, na.rm = T),
    youtube = survey_total(s_youtube_5_na, na.rm = T),
    facebook = survey_total(s_facebook_5_na, na.rm = T),
    amazon = survey_total(s_amazon_5_na, na.rm = T),
    other = survey_total(s_other_5_na, na.rm = T)
  ) %>%
  pivot_longer(total:other_se, names_to = "visit_type", values_to = "duration") %>%
  group_by(dataset) %>%
  mutate(total = duration[visit_type == "total"]) %>%
  filter(
    visit_type != "total",
    !str_detect(visit_type, "se")
  ) %>%
  mutate(perc = duration * 100 / total)

agg_perc_time <- recode_labels_agg(agg_perc_time)

agg_perc_time %>%
  ggplot(aes(
    fill = dataset, color = dataset, y = perc, x = factor(visit_type),
    alpha = alpha
  )) +
  geom_bar(position = "dodge", stat = "identity", linewidth = .5) +
  facet_wrap(~dataset) +
  scale_y_continuous(
    name = "Percent of all visits", limits = c(0, 70),
    labels = function(x) paste0(x, "%")
  ) +
  scale_fill_manual(values = c(facebook, lucid, yougov)) +
  scale_color_manual(values = c(facebook, lucid, yougov)) +
  scale_alpha_manual(values = c(.1, 1)) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 14),
    axis.title.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.x = element_blank(),
    legend.position = "bottom"
  ) +
  guides(fill = "none", color = "none", alpha = "none")

ggsave("output/figC11_rq1_agg_percent_survey_duration.pdf",
  width = 12, height = 8, units = "in",
  pointsize = 12, bg = "white"
)

## Fig C.12: Distribution of individual-level time spent on survey sites ####

visits %>%
  ggplot(aes(x = s_survey_5_na, y = dataset, fill = dataset, color = dataset)) +
  stat_density_ridges(
    quantile_lines = TRUE,
    quantiles = 2,
    alpha = .4,
    jittered_points = TRUE,
    point_size = 1, point_alpha = .1,
    position = position_raincloud(adjust_vlines = TRUE),
    scale = .7,
    rel_min_height = 0.01
  ) +
  scale_fill_manual(values = c(facebook, lucid, yougov)) +
  scale_color_manual(values = c(facebook, lucid, yougov)) +
  guides(fill = "none", color = "none") +
  ylab("") +
  xlab("Total time (seconds) spent on survey sites per respondent") +
  xlim(0, 200000)

ggsave("output/figC12_rq1_ind_percent_survey_duration_dist.pdf",
  width = 12, height = 8, units = "in",
  pointsize = 12, bg = "white"
)


## Fig C.13: Individual level distribution of time spent on survey visits per respondent ####

avg_perc_time <- visits %>%
  group_by(dataset) %>%
  dplyr::summarize(
    survey = survey_mean(prop_survey_time, na.rm = T) * 100,
    google = survey_mean(prop_google_time, na.rm = T) * 100,
    youtube = survey_mean(prop_youtube_time, na.rm = T) * 100,
    facebook = survey_mean(prop_facebook_time, na.rm = T) * 100,
    amazon = survey_mean(prop_amazon_time, na.rm = T) * 100,
    other = survey_mean(prop_other_time, na.rm = T) * 100
  ) %>%
  pivot_longer(survey:other, names_to = "visit_type", values_to = "perc") %>%
  filter(!str_detect(visit_type, "_se"))

avg_perc_time <- recode_labels_agg(avg_perc_time)

avg_perc_time %>%
  ggplot(aes(
    fill = dataset, color = dataset, y = perc, x = factor(visit_type),
    alpha = alpha
  )) +
  geom_bar(position = "dodge", stat = "identity", size = .5) +
  facet_wrap(~dataset) +
  scale_y_continuous(
    name = "Average individual-level percent of visits", limits = c(0, 65),
    labels = function(x) paste0(x, "%")
  ) +
  scale_fill_manual(values = c(facebook, lucid, yougov)) +
  scale_color_manual(values = c(facebook, lucid, yougov)) +
  scale_alpha_manual(values = c(.1, 1)) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 14),
    axis.text.y = element_text(size = 16),
    axis.title.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.x = element_blank(),
    legend.position = "bottom"
  ) +
  guides(
    fill = FALSE,
    color = FALSE, alpha = FALSE
  )

ggsave("output/figC13_rq1_ind_percent_survey_duration.pdf",
  width = 12, height = 8, units = "in",
  pointsize = 12, bg = "white"
)

# SM C.6: SENSITIVITY ANALYSIS =================================================================

survey_fb_all <- read.csv("data/surveys_processed/survey_FB_all.csv") %>%
  mutate(person_id = as.character(id))

survey_lu_all <- read.csv("data/surveys_processed/survey_LU_all.csv") %>%
  mutate(person_id = as.character(person_id))

survey_yg_all <- read.csv("data/surveys_processed/survey_YG_all.csv") %>%
  mutate(person_id = as.character(person_id))

## Fig C.14: Attrition rate of donors on non-donors on YouGov ####

## have the full survey data on the left
people_visits_yg_attrition <- survey_yg_all %>%
  left_join(., visits_yg %>% select(-weight)) %>%
  # exclude those that only donated survey data, but did not participate in any survey
  # as we attrition does not have meaning for those
  filter(!is.na(endtime)) %>%
  mutate(
    complete_num = case_when(
      is.na(endtime_w2) ~ 0,
      !is.na(endtime_w2) ~ 1
    ),
  )

model_attrition <- lm(complete_num ~ donor, data = people_visits_yg_attrition)

conf_int <- function(data, estimate, std) {
  data %>%
    mutate(
      up90 = {{ estimate }} + 1.64 * {{ std }},
      up95 = {{ estimate }} + 1.96 * {{ std }},
      lb90 = {{ estimate }} - 1.64 * {{ std }},
      lb95 = {{ estimate }} - 1.96 * {{ std }}
    )
}

predictions_attrition <- avg_predictions(model_attrition, by = c("donor")) %>%
  as_tibble() %>%
  conf_int(estimate, std.error) %>%
  mutate(donor = c("Non-Donors", "Donors"))

predictions_attrition %>%
  ggplot(aes(
    y = donor,
    x = estimate,
    label = round(estimate, 2)
  )) +
  geom_col(alpha = .5) +
  geom_errorbar(
    aes(
      xmin = lb90,
      xmax = up90
    ),
    size = 1, width = 0, alpha = 1,
    position = position_dodge(width = .3)
  ) +
  geom_errorbar(
    aes(
      xmin = lb95,
      xmax = up95
    ),
    size = .5, width = .1, alpha = 1,
    position = position_dodge(width = .3)
  ) +
  xlab("Completion rate in wave 2") +
  ylab("") +
  coord_flip() +
  xlim(0, 1)

ggsave("output/figC14_rq1_sensitivity_attrition.pdf",
  width = 12, height = 8, units = "in",
  pointsize = 12, bg = "white"
)

## Fig C.15: Sensitivity analysis for the prevalence of survey professionals ####

# cf. Rosenbaum sensitivity analysis:

calculate_p <- function(p_d, d, change) {
  # p_d = share of professional among donors
  # p_nd = share of professional among non-donors
  # change = p_d/p_nd
  # nd = sample size of non-donors
  # d = sample size of donors

  p_nd <- p_d * change

  nd <- 1 - d

  p_d * d + p_nd * nd
}

perc_splits <- perc_profs %>%
  group_by(dataset) %>%
  group_split()

# share of professionals in each group
p_fb <- perc_splits[[1]]$prevalence
p_lu <- perc_splits[[2]]$prevalence
p_yg <- perc_splits[[3]]$prevalence

# share of donors in each group
n_fb <- table(survey_fb_all$donor)[2] / nrow(survey_fb_all)
n_lu <- table(survey_lu_all$donor)[2] / nrow(survey_lu_all)
n_yg <- table(survey_yg_all$donor)[2] / nrow(survey_yg_all)

professionalism <- perc_splits[[1]]$method

# "We vary the share of professional among non-donors from being equal
# to the number of professionals among donors (0% change) to up to
# 50% more or 50% fewer survey professionals."

## get lower bound ####

res_lower <- list()

changes <- seq(.50, 1, by = .10)

for (i in 1:length(p_fb)) {
  values_fb <- tibble(
    values = unlist(map(changes, ~ calculate_p(p_fb[[i]], n_fb, .x))),
    dataset = "facebook",
    method = professionalism[[i]],
    bounds = changes
  )

  values_lu <- tibble(
    values = unlist(map(changes, ~ calculate_p(p_lu[[i]], n_lu, .x))),
    dataset = "Lucid",
    method = professionalism[[i]],
    bounds = changes
  )

  values_yg <- tibble(
    values = unlist(map(changes, ~ calculate_p(p_yg[[i]], n_yg, .x))),
    dataset = "YouGov",
    method = professionalism[[i]],
    bounds = changes
  )

  res_lower[[i]] <- bind_rows(values_fb, values_lu, values_yg)
}

res_lower <- res_lower %>%
  bind_rows() %>%
  mutate(bound = abs(bounds - 1)) %>%
  select(-bounds) # %>%

## get upper bound ####

changes <- seq(1.5, 1, by = -.1)

res_upper <- list()
for (i in 1:length(p_fb)) {
  values_fb <- tibble(
    values = unlist(map(changes, ~ calculate_p(p_fb[[i]], n_fb, .x))),
    dataset = "Facebook",
    method = professionalism[[i]],
    bounds = changes
  )

  values_lu <- tibble(
    values = unlist(map(changes, ~ calculate_p(p_lu[[i]], n_lu, .x))),
    dataset = "Lucid",
    method = professionalism[[i]],
    bounds = changes
  )

  values_yg <- tibble(
    values = unlist(map(changes, ~ calculate_p(p_yg[[i]], n_yg, .x))),
    dataset = "YouGov",
    method = professionalism[[i]],
    bounds = changes
  )

  res_upper[[i]] <- bind_rows(values_fb, values_lu, values_yg)
}

res_upper <- res_upper %>%
  bind_rows() %>%
  mutate(bound = abs(bounds - 1)) %>%
  select(-bounds)

all_bounds <- bind_cols(
  res_upper %>% select(values_upper = values, everything()),
  res_lower %>% select(values_lower = values)
) %>%
  mutate(alpha = 1 - bound) %>%
  left_join(res_upper %>%
    filter(bound == 0) %>% select(-bound))

ggplot(
  all_bounds,
  aes(
    y = values * 100,
    x = method
  )
) +
  geom_point(
    position = position_dodge(width = 1),
    stat = "identity", color = "black",
    size = 3, shape = 21
  ) +
  geom_errorbar(aes(ymin = values_lower * 100, ymax = values_upper * 100, alpha = bound),
    width = 0, size = 3, color = "black"
  ) +
  facet_grid(dataset ~ .) +
  coord_flip() +
  scale_y_continuous(
    name = "Estimated percent of survey professional",
    limits = c(0, 105),
    labels = function(x) paste0(x, "%")
  ) +
  scale_alpha_continuous(
    range = c(1, .2),
    name = "Change in share of professionals among non-donors",
    labels = function(x) paste0(x * 100, "%")
  ) +
  guides(alpha = guide_legend(nrow = 1, title.position = "top")) +
  labs(x = "Measure of professionalism") +
  theme(
    axis.text.y = element_text(size = 12),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.x = element_blank(),
    legend.position = "bottom",
    legend.title = element_text(hjust = 0.5), # Center the legend title
    legend.text = element_text(hjust = 0.5)
  )

ggsave("output/figC15_rq1_sensitivity_bounds.pdf",
  width = 12, height = 10, units = "in",
  pointsize = 12, bg = "white"
)

# TEXT DESCRIPTIVES ============================================================

cat(
"Survey-taking constitutes a substantial part of participants’ browsing:
... 54.6% of all visits in the Lucid sample and 23.9% in the YouGov sample
... are visits to survey-taking sites. This is much more than visits to,
... for example, google.com (6.0% and 11.3%, respectively).
... Survey professionalism is least prevalent in the Facebook sample,
... i.e., 10.2% of all visits are to survey-taking sites,
... which is less than visits to google.com (25.1%)."
)

agg_perc_visits %>%
  filter(visit_type == "Survey sites") %>%
  select(dataset, visit_type, perc) %>%
  print()

agg_perc_visits %>%
  filter(visit_type == "google.com") %>%
  select(dataset, visit_type, perc) %>%
  print()

cat(
"The average percentages of survey visits based on individual-level variation,
... depicted in SM Figure C.10, are slightly smaller than the aggregate percentages
... (Lucid: 49.7%; YouGov: 16.3%; Facebook: 8.4%)"
)
  
avg_perc %>%
  filter(visit_type == "Survey sites") %>%
  select(dataset, visit_type, perc) %>%
  print()

cat(
"A substantial share of our samples’ browsing time is spent on survey sites,
... whether measured as an aggregate percentage (Lucid: 43%; YouGov: 18.2%; Facebook: 8.7%)
... or as an average individual-level percentage (Lucid: 41%; YouGov: 15%; Facebook: 6.8%)."
)

agg_perc_time %>%
  filter(visit_type == "Survey sites") %>%
  select(dataset, visit_type, perc) %>%
  print()

avg_perc_time %>%
  filter(visit_type == "Survey sites") %>%
  select(dataset, visit_type, perc) %>%
  print()

cat(
"In the Lucid sample, the estimates of survey professionals vary between
35.0% and 65.4%, and go up to 71.1% when including subjects categorized
as professionals by any of the measures; in the YouGov sample, between
7.6% and 11.6%, with 16.6% when using any of the categories; and in the Facebook
sample, between 1.7% and 10.9%, with 11.9% when using any of the categories."
)

perc_profs %>% mutate(prevalence = prevalence * 100) %>%
  print()

cat(
"Using the indicator of professionalism yielding the most conservative estimates,
the share of professionals in the Facebook and YouGov samples does not change
substantially, reducing from 1.7% to 1.1%, and from 7.6% to 4.6%, respectively.
Bounds are wider in Lucid, and the estimation for the percentage of professionals
would decrease from 35% to 20.3%"
)

bounds_fb <- all_bounds %>%
  filter(method == "> 50% of browsing time" &
    dataset == "Facebook" &
    bound %in% c(0.5, 0)) %>%
  select(bound, values_lower)
bounds_fb %>%
  print()

bounds_lu <- all_bounds %>%
  filter(method == "> 50% of browsing time" &
    dataset == "Lucid" &
    bound %in% c(0.5, 0)) %>%
  select(bound, values_lower)
bounds_lu %>%
  print()

bounds_yg <- all_bounds %>%
  filter(method == "> 50% of browsing time" &
    dataset == "YouGov" &
    bound %in% c(0.5, 0)) %>%
  select(bound, values_lower)
bounds_yg %>%
  print()
